In [41]:
%%capture
# Importing packages and functions we need
import sys
# Including cormerant
import cormerant as cmt
# Loading pandas to access the metadata 
import pandas as pd
# Using importlib to update the package as we make changes
import importlib
import os
sys.path.append('../../')
In [2]:
# NOTE(review): hardcoded absolute user path -- breaks for anyone else and
# on re-runs elsewhere; consider a configurable DATA_DIR.  (The trailing
# double slash is harmless.)
os.chdir('/Users/DouglasHannumJr/Downloads/mouse_hypothalamus_200um//')
# List the directory to confirm the expected input files are present.
os.listdir()
Out[2]:
['barcodes.csv',
 'barcodes_per_feature.csv',
 'ExportBarcodes',
 'ExportCellBoundaries',
 'ExportCellMetadata3D',
 'ExportPartitionedBarcodes',
 'feature_boundaries.pkl',
 'feature_metadata.csv',
 'md_updated.csv',
 'retained_feature_boundaries.parquet',
 'seurat.rds',
 'temp_cellBoundaries_with_md_sub.parquet']
In [3]:
# Third-party imports for data wrangling and the 3-D visualization helpers.
# NOTE(review): imports are split across several cells (see also the first
# cell); consolidating them in a single top cell would make re-runs sturdier.
import json
from copy import deepcopy

import numpy as np
import pandas as pd
import scanpy as sc
from clustergrammer2 import CGM2, Network
from IPython.display import HTML, display
from matplotlib import pyplot as plt
from observable_jupyter import embed
from observable_jupyter_widget import ObservableWidget

# For plot_3D
import rasterio
import pyarrow.parquet as pq
import geopandas as gpd
from shapely import wkt, wkb
import shapely
from rasterio.windows import Window
from rasterio.enums import Resampling
# Stops the common warning when reading in images used in 
# the plot_3d_* functions.
import warnings
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)

from PIL import Image
import base64
from io import BytesIO
import s3fs
import polars as pl
>> clustergrammer2 backend version 0.18.0
C:\Users\DouglasHannumJr\AppData\Roaming\Python\Python310\site-packages\geopandas\_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.4-CAPI-1.16.2). Conversions between both will be slow.
  warnings.warn(
C:\Users\DouglasHannumJr\AppData\Local\Temp\ipykernel_4364\1441961771.py:16: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:

import os
os.environ['USE_PYGEOS'] = '0'
import geopandas

In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).
  import geopandas as gpd
In [ ]:
# NOTE(review): duplicate imports -- pandas is already imported as `pd`
# in the first cell; `pickle` is unused below (pd.read_pickle handles it).
import pickle
import pandas
In [ ]:
# One-off preprocessing (empty execution counts -- these cells were not run
# in this session): convert the pickled boundaries to parquet and retain
# only the features present in the metadata.
# NOTE(review): only unpickle files from a trusted source.
x = pd.read_pickle('feature_boundaries.pkl')
In [ ]:
x.to_parquet('./feature_boundaries.parquet')
In [ ]:
md = pd.read_csv('./md_updated.csv')
In [ ]:
# Keep boundary rows whose id appears in the metadata's `name` column.
retained = x[x['id'].isin(md['name'])]
In [ ]:
# Quick shape checks to see how many features were retained.
md.shape
In [ ]:
x.shape
In [ ]:
retained.shape
In [ ]:
retained.to_parquet('./retained_feature_boundaries.parquet')
In [ ]:
# Re-read the metadata indexed by cell id.
md = pd.read_csv('./md_updated.csv', index_col=0)
In [ ]:
# Coordinate ranges, used to eyeball a sensible window center.
md[['global.x','global.y']].describe()
In [ ]:
# Inspect one cell near (0, 250) -- presumably how `cell_idx` below was
# chosen; verify with the author.
md[(md['global.x'] < 10) & (md['global.x'] > -10) & (md['global.y'] > 240) & (md['global.y'] < 260)].iloc[1,:]
In [ ]:
 
In [4]:
# Run parameters for the 3-D window visualization.
directory = './'
# Cell id (metadata index) that the viewing window is centered on.
cell_idx = '1634e56c0fd840998443f4d10b213647'
verbose = True
metadata_filename = 'md_updated.csv'
# Metadata column holding the cell-type annotation.
cell_categories = 'cell.type'
# Side length of the square window around the cell (micron units assumed
# -- TODO confirm).
window_size = 200
max_number_of_cells = 100000
# z_index_number = 3
local = True
# pl_batch_size = 500000
# image_type = 'DAPI'
# image_file = 'images/mosaic_DAPI_z3.tif'
# transformation_file = 'images/micron_to_mosaic_pixel_transform.csv'
cell_boundaries_file = 'retained_feature_boundaries.parquet'
cell_id_name = 'name'
transcripts_file = 'barcodes.csv'
remove_blanks = True
# Spacing applied per z index when extruding polygons to 3-D.
z_distance = 1
cell_colors = None
init_gene = 'none'
# NOTE(review): several of these (directory, verbose, metadata_filename,
# remove_blanks, observable_url, ...) are never referenced later; the embed
# call at the bottom hardcodes a different ("plus") notebook URL.
observable_url = '@cellvisulization/2-3-3-deck-gl-image-polygon-scatter-3d'
In [5]:
# Load metadata with the cell id as the index (needed for .loc below).
md = pd.read_csv('./md_updated.csv', index_col=0)
In [6]:
# Build a square viewing window centered on the selected cell, stored as
# [x_min, x_max, y_min, y_max].
cell = md.loc[cell_idx]
half_width = window_size / 2
window = [cell['global.x'] - half_width,
          cell['global.x'] + half_width,
          cell['global.y'] - half_width,
          cell['global.y'] + half_width]
In [7]:
# Duplicate the global coordinates under simpler names for window filtering.
md[['center_x','center_y']] = md[['global.x','global.y']]
In [8]:
# Subset the metadata to cells whose centers fall inside the window.
# DataFrame.copy() is the idiomatic (and faster) replacement for deepcopy:
# it yields an equally independent frame.
in_window = ((md.center_x > window[0]) &
             (md.center_x < window[1]) &
             (md.center_y > window[2]) &
             (md.center_y < window[3]))
md_sub = md[in_window].copy()
In [9]:
# Open the boundaries parquet lazily so row groups can be read individually.
parquet_file = pq.ParquetFile(cell_boundaries_file)
parquet_file
Out[9]:
<pyarrow.parquet.ParquetFile at 0x28d2c822470>
In [10]:
# Confirm there is a single row group before reading only group 0 below.
parquet_file.num_row_groups
Out[10]:
1
In [11]:
# Safe because the file has exactly one row group (checked above).
rg = parquet_file.read_row_group(0)
In [12]:
# Boolean mask: keep only boundary rows whose id is in the windowed metadata
# subset.  Vectorized isin() replaces the original per-element Python loop,
# which is O(n) interpreter round-trips on a large row group.
index_ = rg['id'].to_pandas().isin(md_sub.index).tolist()
In [13]:
# Apply the mask and move the retained rows into pandas.
rg = rg.filter(index_)
df = rg.to_pandas()
In [14]:
cellBoundaries = df
In [15]:
# Deserialize WKB bytes into shapely geometries.
cellBoundaries['geometry'] = cellBoundaries['geometry'].apply(wkb.loads)
In [16]:
# Promote to a GeoDataFrame so spatial accessors (total_bounds) work.
cellBoundaries = gpd.GeoDataFrame(cellBoundaries, geometry='geometry')
In [17]:
cellBoundaries.shape
Out[17]:
(21535, 14)
In [18]:
currentCells = cellBoundaries['geometry']
In [19]:
# total_bounds returns [minx, miny, maxx, maxy] for all geometries.
global_bounds = currentCells.total_bounds
In [20]:
# Attach metadata to each boundary row.  NOTE(review): both frames share
# column names (center_x/center_y, fov, ...), so the merge produces _x/_y
# suffixed duplicates -- visible in the head() output below.  Downstream
# code only uses unsuffixed columns, but the duplication is wasteful.
cellBoundaries = cellBoundaries.merge(md_sub, left_on = 'id', right_on = 'name')
In [21]:
# Cache the merged frame for later sessions.
cellBoundaries.to_parquet('./temp_cellBoundaries_with_md_sub.parquet')
In [22]:
%%time
# Stream the transcripts CSV with polars and keep only points inside the
# boundary extent.  global_bounds is [minx, miny, maxx, maxy], hence
# indices 0/2 bracket x and 1/3 bracket y.
trx = (pl
       .scan_csv(transcripts_file,
                 rechunk=False)
       .filter((pl.col('global_x') > global_bounds[0]) &
               (pl.col('global_x') < global_bounds[2]) &
               (pl.col('global_y') > global_bounds[1]) &
               (pl.col('global_y') < global_bounds[3]))
       .select(['global_x','global_y','global_z','barcode_id'])
      ).collect().to_pandas()
CPU times: total: 44.2 s
Wall time: 12.1 s
In [23]:
# Sanity check: barcode ids start at 0.  Series.min() is the vectorized
# (and much faster) equivalent of Python's built-in min() over the column.
trx.barcode_id.min()
Out[23]:
0
In [24]:
# Read only the first chunk (10 rows) of the wide per-feature matrix --
# we just need its header (gene names), not the full table.
trx_id = pd.read_csv('./barcodes_per_feature.csv',chunksize=10)
trx_id = next(trx_id)
In [25]:
# Map barcode_id (0-based column order) to gene names taken from the
# per-feature header.  Deriving the length from the header itself replaces
# the magic constant 216 and stays correct if the panel size changes.
gene_names = trx_id.columns[1:]
key = pd.DataFrame({'gene': gene_names,
                    'barcode_id': range(len(gene_names))})
In [26]:
# Attach gene names to every transcript (inner join on barcode_id).
trx = trx.merge(key, on = 'barcode_id')
In [27]:
# Reset to a positional 0..n-1 index so the loop below can index by row.
currentCells = currentCells.reset_index(drop = True)
In [28]:
# Accumulator for the deck.gl polygon layer records.
polygon_data3d = []
In [29]:
# Use the cell-type annotation as the grouping shown in the viewer.
cellBoundaries['cluster_grouping'] = cellBoundaries['cell.type']
In [30]:
cellBoundaries.head()
Out[30]:
id fov_x area x y z center_z geometry center_x_x center_y_x ... doublet.score fov_y volume nFeature_RNA nCount_RNA cell.type cluster center_x_y center_y_y cluster_grouping
0 02dd707d48394665bd357f0d7db03183 22 19.678364 1810.0 618.0 9 10.0 POLYGON ((-53.660 210.850, -53.660 210.704, -5... -55.868503 209.387494 ... 0.09188 22 1692.271273 56 581 INC I2 -55.868503 209.387494 INC
1 02dd707d48394665bd357f0d7db03183 22 53.622804 1810.0 618.0 10 11.0 POLYGON ((-51.759 211.435, -51.759 211.289, -5... -55.868503 209.387494 ... 0.09188 22 1692.271273 56 581 INC I2 -55.868503 209.387494 INC
2 02dd707d48394665bd357f0d7db03183 22 73.275073 1810.0 618.0 11 12.0 POLYGON ((-51.174 212.312, -51.174 212.166, -5... -55.868503 209.387494 ... 0.09188 22 1692.271273 56 581 INC I2 -55.868503 209.387494 INC
3 02dd707d48394665bd357f0d7db03183 22 92.713451 1810.0 618.0 12 13.0 POLYGON ((-50.735 212.312, -50.735 212.166, -5... -55.868503 209.387494 ... 0.09188 22 1692.271273 56 581 INC I2 -55.868503 209.387494 INC
4 02dd707d48394665bd357f0d7db03183 22 104.314878 1810.0 618.0 13 14.0 POLYGON ((-49.711 211.435, -49.711 211.289, -4... -55.868503 209.387494 ... 0.09188 22 1692.271273 56 581 INC I2 -55.868503 209.387494 INC

5 rows × 31 columns

In [31]:
# Build one 3-D polygon record per boundary row: the exterior ring rounded
# to 2 decimals with the z level (scaled by z_distance) appended as the
# third coordinate, plus per-cell annotations for the deck.gl layer.
# NOTE(review): assumes every geometry is a simple Polygon (the MultiPolygon
# branch was commented out) -- confirm upstream data guarantees this.
for idx in range(len(currentCells)):
    cell = currentCells[idx]
    row = cellBoundaries.iloc[idx]
    z_level = row.z
    z_coord = float(z_level * z_distance)

    ring_xy = [[round(coord, 2) for coord in pt]
               for pt in cell.exterior.coords]
    ring_xyz = [xy + [z_coord] for xy in ring_xy]

    polygon_data3d.append({'coordinates': ring_xyz,
                           'name': str(row.id),
                           'z': z_coord,
                           'cell_type': str(row.cluster_grouping),
                           'z_idx': float(z_level)})
        
In [32]:
# Placeholder partition: transcripts are not assigned to cells here, so all
# get the sentinel -1 (becomes the 'partition' field downstream).
trx['cell_id'] = -1
In [33]:
trx
Out[33]:
global_x global_y global_z barcode_id gene cell_id
0 -53.321777 147.97624 1.0 0 Slc17a6 -1
1 -44.658420 156.94585 1.0 0 Slc17a6 -1
2 -43.245136 158.45552 1.0 0 Slc17a6 -1
3 -41.824020 166.70111 1.0 0 Slc17a6 -1
4 -102.173310 171.07602 1.0 0 Slc17a6 -1
... ... ... ... ... ... ...
1401810 92.265366 218.00143 176.0 158 Blank-3 -1
1401811 66.759600 164.93889 177.0 158 Blank-3 -1
1401812 114.019980 289.76685 188.0 158 Blank-3 -1
1401813 20.250843 179.71475 196.0 158 Blank-3 -1
1401814 32.352300 181.94100 199.0 158 Blank-3 -1

1401815 rows × 6 columns

In [34]:
# Drop control probes: any transcript whose gene name contains "Blank".
# .copy() is the pandas-idiomatic replacement for deepcopy, and regex=False
# makes the literal substring intent explicit (and faster).
trx_no_blanks = trx[~trx['gene'].str.contains('Blank', regex=False)].copy()
In [35]:
# Sanity checks: ~5.8k Blank probes removed out of ~1.4M transcripts.
trx.shape
Out[35]:
(1401815, 6)
In [36]:
trx_no_blanks.shape
Out[36]:
(1396002, 6)
In [37]:
trx_no_blanks.head()
Out[37]:
global_x global_y global_z barcode_id gene cell_id
0 -53.321777 147.97624 1.0 0 Slc17a6 -1
1 -44.658420 156.94585 1.0 0 Slc17a6 -1
2 -43.245136 158.45552 1.0 0 Slc17a6 -1
3 -41.824020 166.70111 1.0 0 Slc17a6 -1
4 -102.173310 171.07602 1.0 0 Slc17a6 -1
In [38]:
# Reshape transcripts for the Observable scatter layer.  Rename by explicit
# mapping (robust to column reordering, unlike the original positional
# columns assignment), and .copy() replaces deepcopy idiomatically.
df_obs = trx_no_blanks.rename(columns={'global_x': 'x',
                                       'global_y': 'y',
                                       'global_z': 'z',
                                       'gene': 'name',
                                       'cell_id': 'partition'}).copy()
# Reduce payload size: 0.1-unit precision is enough for display.
df_obs['x'] = df_obs['x'].round(1)
df_obs['y'] = df_obs['y'].round(1)
df_obs['partition'] = df_obs['partition'].astype(float)
# One dict per point, as expected by the embedded notebook.
scatter_data = df_obs.to_dict('records')
In [39]:
# Assign categorical colors for genes (scatter layer) and cell types
# (polygon layer).  A single helper replaces the two near-identical loops;
# this also fixes the second loop, which reused tab20 in all three branches
# instead of cycling tab20 -> tab20b -> tab20c like the first loop.
def _categorical_hex_colors(categories):
    """Return one hex color string per category, cycling through the 60
    distinct colors of matplotlib's tab20, tab20b and tab20c palettes."""
    palettes = [plt.cm.tab20, plt.cm.tab20b, plt.cm.tab20c]
    hex_colors = []
    for inst_index in range(len(categories)):
        mod_index = inst_index % 60
        # 0-19 -> tab20, 20-39 -> tab20b, 40-59 -> tab20c
        color_rgba = palettes[mod_index // 20](mod_index % 20)
        hex_colors.append('#{:02x}{:02x}{:02x}'.format(int(color_rgba[0]*255),
                                                       int(color_rgba[1]*255),
                                                       int(color_rgba[2]*255)))
    return hex_colors

# gene colors
cats = df_obs.name.unique().tolist()
cat_colors = dict(zip(cats, _categorical_hex_colors(cats)))

# cell colors
###################
cats = cellBoundaries.cluster_grouping.unique().tolist()
cell_colors = dict(zip(cats, _categorical_hex_colors(cats)))

# Window origin offsets (global_bounds = [minx, miny, maxx, maxy]).
x_offset = float(global_bounds[0])
y_offset = float(global_bounds[1])
In [42]:
# Compress the (large) JSON payloads before shipping them to Observable.
zip_polygon_data3d = cmt.viz.json_zip(polygon_data3d)
zip_scatter_data = cmt.viz.json_zip(scatter_data)
In [43]:
# Alias; bounds order is [minx, miny, maxx, maxy] from total_bounds.
bounds = global_bounds
In [44]:
# Inputs handed to the embedded Observable notebook (image layer disabled).
fov_inputs = {
    # 'polygon_data3d': polygon_data3d,
    'zip_polygon_data3d': zip_polygon_data3d,
    'zip_scatter_data': zip_scatter_data,
    # 'image': image_url,
    # 'image_bounds': image_bounds,
    # bounds = [minx, miny, maxx, maxy], so 0/2 are x and 1/3 are y.
    'center_x': np.mean([bounds[0], bounds[2]]),
    'center_y': np.mean([bounds[1], bounds[3]]),
    'zoom': -.5,
    'min_zoom': -3,
    'height': 800,
    'cat_colors': cat_colors,
    'cell_colors': cell_colors,
    'ini_opacity':1,
    'scale_z': 1
    # 'init_gene': init_gene,
}
In [45]:
# Display the bounds used for the view.
bounds
Out[45]:
array([-105.60812654,  145.46162128,  115.75187066,  361.5968692 ])
In [46]:
# Render the 3-D dashboard.  NOTE(review): this URL differs from the unused
# `observable_url` config variable (this is the "plus" variant) -- confirm
# which notebook version is intended.
embed('@cellvisulization/2-3-3plus-deck-gl-image-polygon-scatter-3d',
      cells=['dashboard'],
      inputs = fov_inputs)
Edit @cellvisulization/2-3-3plus-deck-gl-image-polygon-scatter-3d on Observable
In [ ]:
# embed('@cellvisulization/2-3-3plus-deck-gl-image-polygon-scatter-3d',
#       cells=['dashboard'],
#       inputs = fov_inputs)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: